#!/usr/bin/env Rscript

#install R libs on server
#install.packages("dplyr", lib="~/mylibs/R")
#.libPaths("~/mylibs/R")
require(data.table)
library("dplyr")

#############################
############################# Monte Carlo Resampling/Bootstrapping
#############################

#before running this script, change to full local path to the replication data
wd1 <- "C:\\Users\\YFan\\OneDrive - NORCE\\NFood_replication\\"

crops = c("Corn", "Wheat", "Soybean", "Cotton", "Rice", "Sugarcane", "Tropical Corn", "Tropical Soybean")
scenario = c("RCP45 - RCP85", "SAI - RCP85", "MSB - RCP85", "CCT - RCP85")

#read MLR regression coefficients prepared by "Data2.MLR-analysis.R"
load(paste0(wd1,"/data/RData/MLR-pergridregression.Rdata")) 
#load(paste0(wd,"/data/RData/MLR-pergridregression-VIF.Rdata")) 
#dif.8crop.vif, coeff.i.samples.vif are selected samples with VIF<10; 
#SAI has more samples with VIF>10 removed than other cases, if using "dif.8crop.vif.wma", the tot.eff of SAI becomes slightly underestimated at the end of 21st century     
#thus keep using MLR coefficients of all samples from dif.8crop.s  
#98% samples are good; only 139 (2%) samples have VIF >10
#small collinearity does not affect the final prediction of total effect (also does not affect the relative scales of partial effects of T,R,M)  

#==Monte Carlo/Bootstrap resampling on total and partial effects of individual variables for each sample
#effect.i.samples <- coeff.i.samples.vif[dif.8crop.vif[Irrig==TRUE], 
effect.i.samples <- coeff.i.samples[dif.8crop.s[Irrig==TRUE],                                    
                                    .(Intercept, T.log=T*i.tsa.dif, RD.log=RD*i.radd.dif, RI.log=RI*i.radi.dif, 
                                      R.log=RD*i.radd.dif+RI*i.radi.dif, P.log=0, RH.log=0, M.log=0, 
                                      co2.log=i.co2.log, luc.log=i.luc.log, 
                                      Y.mod.dif=i.yield.dif, Y.mod.dif2=i.yield.dif+i.co2.log, Y.mod.dif3=i.yield.dif+i.co2.log+i.luc.log, 
                                      yield1=i.yield1, yield0=i.yield0, yield=i.yield, 
                                      area=i.area.wt, Year=i.Year), by=.EACHI, on=.(Scenario,Crop,Sample)]
#effect.r.samples <- coeff.r.samples.vif[dif.8crop.vif[Irrig==FALSE], 
effect.r.samples <- coeff.r.samples[dif.8crop.s[Irrig==FALSE], 
                                    .(Intercept, T.log=T*i.tsa.dif, R.log=RD*i.radd.dif+RI*i.radi.dif, M.log=P*i.ppt.dif+RH*i.rh.dif,
                                      RD.log=RD*i.radd.dif, RI.log=RI*i.radi.dif, P.log=P*i.ppt.dif, RH.log=RH*i.rh.dif, 
                                      co2.log=i.co2.log, luc.log=i.luc.log,
                                      Y.mod.dif=i.yield.dif, Y.mod.dif2=i.yield.dif+i.co2.log, Y.mod.dif3=i.yield.dif+i.co2.log+i.luc.log, 
                                      yield1=i.yield1, yield0=i.yield0, yield=i.yield, 
                                      area=i.area.wt, Year=i.Year), by=.EACHI, on=.(Scenario,Crop,Sample)]
effect.i.samples  <- effect.i.samples[,Tot.log:=T.log+RD.log+RI.log+Intercept] #if regression includes intercept, do not forget it here.
effect.r.samples  <- effect.r.samples[,Tot.log:=T.log+RD.log+RI.log+P.log+RH.log+Intercept]
effect.i.samples  <- effect.i.samples[,Tot.log2:=Tot.log+co2.log] #if regression includes intercept, do not forget it here.
effect.r.samples  <- effect.r.samples[,Tot.log2:=Tot.log+co2.log]
effect.i.samples  <- effect.i.samples[,Tot.log3:=Tot.log2+luc.log] #all effects
effect.r.samples  <- effect.r.samples[,Tot.log3:=Tot.log2+luc.log] #larger than adding luc.log to tot.log2 after aggregating to global
effect.i.samples  <- effect.i.samples[,Res.log:=Y.mod.dif-Tot.log] #residuals
effect.r.samples  <- effect.r.samples[,Res.log:=Y.mod.dif-Tot.log] 

#this is original wtm of grid sample effects, should be within Bootstrap or MC resample confidence interval
effect.i.samples$Irrig <- TRUE
effect.r.samples$Irrig <- FALSE
effect.s <- rbind(effect.i.samples, effect.r.samples)


#aggregate partial and total effects to global
#option 1: when weighted mean effect (in log) by area is calculated during each Bootstrap resampling (also do this for modeled yield differences Y.mod.dif, Y.mod.dif2..)
eff.log.wm <- effect.s[, .(T.log=weighted.mean(T.log, area, na.rm = T), RD.log=weighted.mean(RD.log, area, na.rm = T),RI.log=weighted.mean(RI.log, area, na.rm = T),
                           P.log=weighted.mean(P.log, area, na.rm = T),RH.log=weighted.mean(RH.log, area, na.rm = T),
                           R.log=weighted.mean(R.log, area, na.rm = T), M.log=weighted.mean(M.log, area, na.rm = T),
                           Tot.log=weighted.mean(Tot.log, area, na.rm = T), Y.mod.dif=weighted.mean(Y.mod.dif, area, na.rm = T), 
                           co2.log=weighted.mean(co2.log, area, na.rm = T),luc.log=weighted.mean(luc.log, area, na.rm = T), 
                           Tot.log2=weighted.mean(Tot.log2, area, na.rm = T), Y.mod.dif2=weighted.mean(Y.mod.dif2, area, na.rm = T),
                           Tot.log3=weighted.mean(Tot.log3, area, na.rm = T), Y.mod.dif3=weighted.mean(Y.mod.dif3, area, na.rm = T),
                           Res.log=weighted.mean(Res.log, area, na.rm = T),
                           yield1=weighted.mean(yield1, area, na.rm = T), yield0=weighted.mean(yield0, area, na.rm = T), yield=weighted.mean(yield, area, na.rm = T),
                           area=sum(area, na.rm=T)),
                       by =.(Scenario,Crop,Irrig,Year)]


eff.log.wm$Level <- "Mean"
(scenario<-unique(eff.log.wm$Scenario))

rm(effect.s)
rm(dif.8crop.s)



#=======bootstrap resampling to get probability distribution of partial and total climate effects
#library(boot)
library(dplyr)
library(rsample)
library(broom)
library(purrr) #use purrr::map to apply a function to all the bootstrap samples at once


#=define helper function to obtain crop weighted average climate effect per year using per-grid regression
bs.wtm <- function(split) {
  #=======resample irrig/rainfed crops together
  sample.s <- analysis(split)  #boots sample from one scenario within each crop type based on sample.data

  #==sample predicted effects using the sample ID from above
  eff.log.s <- data[sample.s[,c("Scenario","Crop","Sample")], on=.(Crop,Sample)] #use the same sample indices of each crop for all scenarios
  
  #==weighted by area gives best estimate of global effect
  eff.wm <- eff.log.s[, .(T.log=weighted.mean(T.log, area, na.rm = T), RD.log=weighted.mean(RD.log, area, na.rm = T),RI.log=weighted.mean(RI.log, area, na.rm = T),
                          P.log=weighted.mean(P.log, area, na.rm = T),RH.log=weighted.mean(RH.log, area, na.rm = T),
                          R.log=weighted.mean(R.log, area, na.rm = T), M.log=weighted.mean(M.log, area, na.rm = T),
                          co2.log=weighted.mean(co2.log, area, na.rm = T), luc.log=weighted.mean(luc.log, area, na.rm = T),
                          Tot.log=weighted.mean(Tot.log, area, na.rm = T), Tot.log2=weighted.mean(Tot.log2, area, na.rm = T), Tot.log3=weighted.mean(Tot.log3, area, na.rm = T),
                          Y.mod.dif=weighted.mean(Y.mod.dif, area, na.rm = T),Y.mod.dif2=weighted.mean(Y.mod.dif2, area, na.rm = T),Y.mod.dif3=weighted.mean(Y.mod.dif3, area, na.rm = T),
                          Res.log=weighted.mean(Res.log, area, na.rm = T),
                          yield1=weighted.mean(yield1, area, na.rm = T), yield0=weighted.mean(yield0, area, na.rm = T), yield=weighted.mean(yield, area, na.rm = T),
                          area=sum(area, na.rm=T)),
                      by =.(Scenario,Crop,Year)]
  
  return(eff.wm)
}


set.seed(27)
#sample.data = coeff.i.samples.vif[Scenario==scenario[1]] #only for getting initial sample ID of one scenario, later applied to all other scenarios
sample.data = coeff.i.samples[Scenario==scenario[1]]
boots.i <- bootstraps(sample.data, times = 1000, strata=Crop) #stratified sampling per crop to obtain sample IDs
#sample.data = coeff.r.samples.vif[Scenario==scenario[1]]
sample.data = coeff.r.samples[Scenario==scenario[1]]
boots.r <- bootstraps(sample.data, times = 1000, strata=Crop) #stratified sampling per crop type
#str(analysis(boots$splits[[1]])) #analysis() convert a rsplit sample to data.frame

#===bootstrapping process (takes ~100mins for 1000 times)
data <- effect.i.samples #external input (log) to function bs.wtm; this is the real data to bootstrap
boot_stats_i <- boots.i %>%
  mutate(stats = map(splits, bs.wtm)) #splits is just for getting sample ID of one scenario based on sample.data
  #stats_tidy = map(stats, tidy))
data <- effect.r.samples #external input (log) to function bs.wtm
boot_stats_r <- boots.r %>%
  mutate(stats = map(splits, bs.wtm)) #eff.r.samples is the real data to bs.wtm

# save(list=c("boot_stats_i","boot_stats_r"), file = paste0(wd1,"/data/RData/MLR-pergrid-log-wtm-boot_stats_1000.Rdata"))

rm(effect.i.samples, effect.r.samples)



#===confidence interval for irrigated crops
alpha <- .05 #95% confidence interval
eff.i.low <- boot_stats_i %>%
  unnest(stats) %>%
  dplyr::group_by(Scenario, Crop, Year) %>%
  #dplyr::summarize(Conf.05 = quantile(Tot.log, alpha / 2, na.rm=T),
  #                 Conf.95 = quantile(Tot.log, 1 - alpha / 2, na.rm=T))
  dplyr::summarise_at(c("T.log","RD.log","RI.log","P.log","RH.log","R.log","M.log","co2.log","luc.log",
                        "Tot.log","Tot.log2","Tot.log3","Y.mod.dif","Y.mod.dif2","Y.mod.dif3","Res.log"), quantile, probs = alpha/2, na.rm = TRUE)
eff.i.high <- boot_stats_i %>%
  unnest(stats) %>%
  dplyr::group_by(Scenario, Crop, Year) %>%
  dplyr::summarise_at(c("T.log","RD.log","RI.log","P.log","RH.log","R.log","M.log","co2.log","luc.log",
                        "Tot.log","Tot.log2","Tot.log3","Y.mod.dif","Y.mod.dif2","Y.mod.dif3","Res.log"), quantile, probs = 1 - alpha/2, na.rm = TRUE)

eff.i.low$Level="Conf.L"
eff.i.high$Level="Conf.U"
eff.i.ci <- as.data.table(rbind(eff.i.low, eff.i.high))%>%
  mutate(Scenario = factor(Scenario, levels=scenario)) %>%
  setorder(Scenario)


#===rainfed crops

alpha <- .05 #95% confidence interval (2.5th to 97.5th percentiles)
eff.r.low <- boot_stats_r %>%
  unnest(stats) %>%
  dplyr::group_by(Scenario, Crop, Year) %>%
  dplyr::summarise_at(c("T.log","RD.log","RI.log","P.log","RH.log","R.log","M.log","co2.log","luc.log",
                        "Tot.log","Tot.log2","Tot.log3","Y.mod.dif","Y.mod.dif2","Y.mod.dif3","Res.log"), quantile, probs = alpha/2, na.rm = TRUE)
eff.r.high <-boot_stats_r %>%
  unnest(stats) %>%
  dplyr::group_by(Scenario, Crop, Year) %>%
  dplyr::summarise_at(c("T.log","RD.log","RI.log","P.log","RH.log","R.log","M.log","co2.log","luc.log",
                        "Tot.log","Tot.log2","Tot.log3","Y.mod.dif","Y.mod.dif2","Y.mod.dif3","Res.log"), quantile, probs = 1 - alpha/2, na.rm = TRUE)

eff.r.low$Level="Conf.L"
eff.r.high$Level="Conf.U"
eff.r.ci <- as.data.table(rbind(eff.r.low, eff.r.high)) %>%
  mutate(Scenario = factor(Scenario, levels=scenario)) %>%
  setorder(Scenario)


#====final step: combine mean and confidence interval
eff.i.ci$Irrig=TRUE
eff.r.ci$Irrig=FALSE
eff.ci.boots <- rbind(eff.i.ci, eff.r.ci)
eff.ci.boots <- as.data.table(eff.ci.boots)

eff.log.wm$Level <- "Mean"
eff.log.boots <- rbind(eff.log.wm, eff.ci.boots, fill=TRUE)
#add mean base yield and area (from dif.8crop.s) to levels Conf.L and Conf.U
eff.log.boots <- eff.log.boots[eff.log.wm, yield1:=i.yield1, on=.(Scenario,Crop,Irrig,Year)]
eff.log.boots <- eff.log.boots[eff.log.wm, yield0:=i.yield0, on=.(Scenario,Crop,Irrig,Year)]
eff.log.boots <- eff.log.boots[eff.log.wm, yield:=i.yield, on=.(Scenario,Crop,Irrig,Year)]
eff.log.boots <- eff.log.boots[eff.log.wm, area:=i.area, on=.(Scenario,Crop,Irrig,Year)]


save(list=c("eff.log.wm", "eff.ci.boots", "eff.log.boots"), file = paste0(wd1,"/data/RData/MLR-pergrid-log-wtm-effect-Bootstrap-1000.Rdata"))


#========end of Monte Carlo resampling/bootstrapping============

